In [1]:
#import libraries
import numpy as np
import pandas as pd
import os

In [2]:
#read weather dataframe
weather = pd.read_csv(os.path.join("data", "weather.csv"), na_values=["M", "-", "*"])

In [3]:
for column in weather.columns:
    if sum(weather[column].isnull()) > 0:
        print column


tmax
tmin
tavg
depart
dewpoint
wetbulb
heat
cool
sunrise
sunset
snowfall
preciptotal
stnpressure
sealevel
resultspeed
resultdir
avgspeed

In [4]:
#Fill T values for snowfall with 0.01, meaning something is falling down but not that much
def change_snowfall(x):
    if x == "  T":
        return 0.01    
    else:
        return float(x)
    
weather["snowfall"] = weather["snowfall"].map(change_snowfall)

In [5]:
def change_preciptotal(x):
    if x == "  T":
        return 0.01   
    else:
        return float(x)
    
weather["preciptotal"] = weather["preciptotal"].map(change_preciptotal)

In [6]:
#separate codesum column into separate binary columns
codesum_columns = set(' '.join(set(weather["codesum"])).strip().split())
codesum = pd.DataFrame(index=weather.index, columns=codesum_columns)

In [7]:
for column in codesum.columns:
    for i in range(len(weather.index)):
        if column in weather["codesum"][i]:
            codesum[column][i] = 1

In [8]:
#drop initial codesum column
weather = weather.drop("codesum", 1)

In [9]:
weather = weather.join(codesum.fillna(0))

In [10]:
weather.head()


Out[10]:
station_nbr date tmax tmin tavg depart dewpoint wetbulb heat cool ... PRFG FG+ TSRA FZFG BLDU MIFG SQ BLSN SN SG
0 1 2012-01-01 52 31 42 NaN 36 40 23 0 ... 0 0 0 1 0 0 0 0 0 0
1 2 2012-01-01 48 33 41 16 37 39 24 0 ... 0 0 0 0 0 0 0 0 0 0
2 3 2012-01-01 55 34 45 9 24 36 20 0 ... 0 0 0 0 0 0 0 0 0 0
3 4 2012-01-01 63 47 55 4 28 43 10 0 ... 0 0 0 0 0 0 0 0 0 0
4 6 2012-01-01 63 34 49 0 31 43 16 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 47 columns


In [11]:
def diff_dates_2015(date_x):
  date_format = "%m/%d/%Y"
  x = datetime.strptime(date_x, date_format)
  y = datetime.strptime('01/01/2015', date_format)
  delta = y - x
  return delta.days

I am adding column about number of days since beginning of the corresponding year


In [12]:
from datetime import datetime
def get_days(date_x):
  date_format = "%Y-%m-%d"
  x = datetime.strptime(date_x, date_format)
  y = datetime.strptime('{year}-01-01'.format(year=x.year), date_format)
  delta = x - y
  return delta.days

In [13]:
weather['days'] = weather['date'].map(get_days)

In [14]:
def in_minutes(x):        
    if np.isnan(x):        
        return np.nan    
    else:
        temp = int(x)
        if (str(temp)[-2:]) == '60':
            temp -= 1        
        b = datetime.strptime(str(temp), "%H%M")
        a = datetime.strptime('0000', "%H%M")                    
        return (b - a).total_seconds() / 60

In [15]:
weather["sunrise"] = weather['sunrise'].apply(in_minutes)

In [16]:
weather["sunset"] = weather['sunset'].apply(in_minutes)

In [17]:
weather.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 20517 entries, 0 to 20516
Data columns (total 48 columns):
station_nbr    20517 non-null int64
date           20517 non-null object
tmax           19611 non-null float64
tmin           19609 non-null float64
tavg           19048 non-null float64
depart         9006 non-null float64
dewpoint       19851 non-null float64
wetbulb        19265 non-null float64
heat           19048 non-null float64
cool           19048 non-null float64
sunrise        10861 non-null float64
sunset         10861 non-null float64
snowfall       13293 non-null float64
preciptotal    19657 non-null float64
stnpressure    19588 non-null float64
sealevel       18793 non-null float64
resultspeed    19928 non-null float64
resultdir      19928 non-null float64
avgspeed       19642 non-null float64
HZ             20517 non-null int64
FU             20517 non-null int64
UP             20517 non-null int64
TSSN           20517 non-null int64
VCTS           20517 non-null int64
DZ             20517 non-null int64
BR             20517 non-null int64
FG             20517 non-null int64
BCFG           20517 non-null int64
DU             20517 non-null int64
FZRA           20517 non-null int64
TS             20517 non-null int64
RA             20517 non-null int64
PL             20517 non-null int64
GS             20517 non-null int64
GR             20517 non-null int64
FZDZ           20517 non-null int64
VCFG           20517 non-null int64
PRFG           20517 non-null int64
FG+            20517 non-null int64
TSRA           20517 non-null int64
FZFG           20517 non-null int64
BLDU           20517 non-null int64
MIFG           20517 non-null int64
SQ             20517 non-null int64
BLSN           20517 non-null int64
SN             20517 non-null int64
SG             20517 non-null int64
days           20517 non-null int64
dtypes: float64(17), int64(30), object(1)
memory usage: 7.7+ MB

In [18]:
#save to file
weather.to_csv(os.path.join("data", "weather_modified_4.csv"), index=False)

In [ ]: